library(here)
library(tidyverse)
source(here("utils.R"))
Read and preprocess the anonymized data
# Path to the raw, anonymized results file of the blocksworld-prior experiment.
data_path <- here("experiments", "blocksworld-prior", "results", "data-raw",
"results_anonymized_12_blocksworld-prior_BG.csv");
# preprocess_data() is defined in utils.R (sourced above); judging from the
# console transcript below it reads the CSV and parses the column types.
data <- preprocess_data(data_path)
## [1] "read data from: /home/britta/UNI/Osnabrueck/conditionals-blocksworld/experiments/blocksworld-prior/results/data-raw/results_anonymized_12_blocksworld-prior_BG.csv"
## Parsed with column specification:
## cols(
## .default = col_character(),
## submission_id = col_double(),
## RT = col_double(),
## age = col_double(),
## endTime = col_double(),
## experiment_id = col_double(),
## question = col_logical(),
## startTime = col_double(),
## timeSpent = col_double(),
## trial_number = col_double()
## )
## See spec(...) for full column specifications.
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 1450
## rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
## 20, ...].
# Keep only the main (non-training) slider trials.
data <- filter(data, trial_name == "slider_main")
nrow(data)
## [1] 5000
# All unique stimulus identifiers; used later for the per-stimulus
# requirement checks.
STIMULI <- data %>%
  ungroup() %>%
  distinct(stimulus_id) %>%
  pull(stimulus_id)
# Output directory for all plots; create it if it does not exist yet.
TARGET_DIR <- here("experiments", "blocksworld-prior", "results", "plots")
dir.create(TARGET_DIR, recursive = TRUE, showWarnings = FALSE)
Duration, age, gender etc.
# Duration: time spent per participant, with mean + 1 SD (yellow) and
# mean + 2 SD (red) reference lines.
df <- data %>% ungroup() %>% select(participant_id, timeSpent, age, gender) %>%
  group_by(participant_id) %>%
  distinct()
# Suppress the boxplot's own outlier points: geom_jitter() below draws
# every observation anyway, so outliers would otherwise be plotted twice
# (this was the "points are added twice" TODO).
p <- df %>%
  ggplot(aes(x = factor(0), y = timeSpent)) +
  geom_boxplot(outlier.shape = NA)
# `color=` must be set OUTSIDE aes(): inside aes() a constant is treated as
# a discrete legend label and the lines are drawn in default colors instead
# of actually being yellow/red.
mean_ts <- mean(df$timeSpent)
sd_ts <- sd(df$timeSpent)
p + geom_jitter(shape = 16) +
  geom_hline(yintercept = mean_ts + sd_ts, color = "yellow") +
  geom_hline(yintercept = mean_ts + 2 * sd_ts, color = "red") +
  theme(legend.position = "none")
# Per-participant demographics summary (one row per participant after the
# distinct() above, so the statistics are over participants, not trials).
# NOTE(review): group_by() has no effect here -- base summary() ignores
# dplyr groups; the output is correct only because df already has one row
# per participant.
df %>% ungroup() %>% group_by(participant_id) %>% summary()
## participant_id timeSpent age gender
## participant1 : 1 Min. : 7.279 Min. :19.00 female:26
## participant10: 1 1st Qu.:12.355 1st Qu.:27.00 male :24
## participant11: 1 Median :14.458 Median :33.50
## participant12: 1 Mean :17.000 Mean :35.10
## participant13: 1 3rd Qu.:17.748 3rd Qu.:42.75
## participant14: 1 Max. :57.989 Max. :63.00
## (Other) :44
# Reaction time per experimental trial, with mean + 1 SD (yellow) and
# mean + 2 SD (red) reference lines.
df <- data %>% ungroup() %>% select(participant_id, stimulus_id, RT) %>%
  filter(!is.na(stimulus_id) & !is.na(RT)) %>%
  group_by(stimulus_id) %>%
  distinct()
# `color=` outside aes(): inside aes() it only creates a legend mapping and
# the lines are drawn in default colors. (The "points added twice" TODO
# looks like a leftover from the jittered plot above -- there is no
# geom_jitter here.)
rt_mean <- mean(df$RT)
rt_sd <- sd(df$RT)
p <- df %>% ggplot(aes(x = stimulus_id, y = RT)) + geom_boxplot() +
  geom_hline(yintercept = rt_mean + rt_sd, color = "yellow") +
  geom_hline(yintercept = rt_mean + 2 * rt_sd, color = "red") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90)) +
  ggtitle("Reaction times per stimulus")
ggsave(file.path(TARGET_DIR, "Reaction-times.jpg"), p,
       width = 6, height = 4)
p
# Inspect free-text comments for anything that warrants exclusions.
data %>% ungroup() %>% select(comments) %>% unique()
Discard trials where the reaction time was more than 3 minutes or less than 5 seconds (remember, there were 4 sliders to move in one trial).
Filter out trials where something went wrong according to the participants' comments.
# Exclude the two trials one participant flagged as broken in the comments
# (no example was shown for their final two questions).
data_filtered <- data %>%
filter(!(comments=="My final two questions, didn't have an example to show me." &
(trial_number == 24 | trial_number == 25)))
nrow(data_filtered)
## [1] 4992
# TODO: something went wrong with reaction times! (only first trial is saved)
# Keep trials with RT of at most 3 minutes; NA RTs are kept because of the
# RT-logging issue noted above.
data_filtered <- data_filtered %>% filter(is.na(RT) | RT <= 3 * 60 * 1000)
nrow(data_filtered)
## [1] 4982
# Keep trials with RT above 5 seconds -- with 4 sliders to move per trial,
# anything faster is implausible.
data_filtered <- data_filtered %>% filter(is.na(RT) | RT > 5000)
nrow(data_filtered)
## [1] 4971
# Drop bookkeeping columns that are no longer needed for the analysis.
df <- data_filtered %>%
select(-utt_idx, -trial_name, -trial_number, -timeSpent, -gender, -age,
-comments, -RT)
# match colors and blocks depending on color-group
# Map the color-specific utterances (presumably b = blue, g = green -- see
# the plot labels further below) to the abstract block labels a/c according
# to the participant's color_group; "bg" and "none" are group-independent.
# Responses are rescaled from slider points (0-100) to [0, 1].
# NOTE(review): case_when() has no TRUE ~ fallback, so an unexpected
# utterance value would silently become NA; the stopifnot() below only
# checks `response`, not `utterance`.
data_processed <- df %>%
group_by(participant_id, stimulus_id, color_group) %>%
mutate(utterance = case_when(color_group=="group1" & utterance=="b" ~ "a",
color_group=="group1" & utterance=="g" ~ "c",
color_group=="group2" & utterance=="b" ~ "c",
color_group=="group2" & utterance=="g" ~ "a",
utterance=="bg" ~ "ac",
utterance=="none" ~ "none"
),
utterance=factor(utterance, levels = c("a", "c", "ac", "none")),
response = response/100) %>%
ungroup() %>%
select(-color_group)
# Discard trials where all four responses are zero -- the normalization
# below would otherwise divide by zero.
df <- data_processed %>%
group_by(participant_id, stimulus_id) %>%
filter(sum(response) != 0)
nrow(df)
## [1] 4946
# Normalize the four responses of each trial so they sum to 1; `n` keeps
# the pre-normalization sum.
data_normalized <- df %>%
mutate(n=sum(response), response=response/n)
data_normalized
# Sanity check: normalization must not have produced any NA responses.
stopifnot(data_normalized %>% filter(is.na(response)) %>% nrow() == 0)
After normalizing, check whether minimal requirements are fulfilled.
If at least one block clearly falls / clearly doesn’t fall (touches the ground), but the participant put high probability on none (neither block touches the ground) or on ac (both touch the ground), the trial is discarded; in these cases the participant cannot have been concentrating.
This doesn’t really feel good — in the future, I might use control trials instead, e.g. one control trial after every fifth trial (i.e. 5 in total), and if more than two of the control trials aren’t answered meaningfully, discard the participant’s entire data. Alternatively, it may be necessary to make the pictures clearer by adjusting the blocks’ positions for high/low/uncertain priors to fall, so that these cases can be distinguished more easily.
The following pictures show all scenes for which I specified such a requirement. Below are the stimulus ids of those scenes of which some trials had to be discarded.
“S1-121”
“S10-203”
“S12-203”
“S15-443”
“S20-468”
“S22-468”
“S30-805”
“S32-806”
“S7-130”
“S7-130-dep”
# Per-scene minimal-requirement annotations. For a given scene,
# req.exp1.not.small / req.exp1.not.large name the response column whose
# probability must not be small / must be large (see check() below); only
# scenes with at least one such requirement are kept.
fn <- "scenes_luh_annotations.csv"
min.require <- here("experiments", "stimuli", fn) %>%
  read_csv() %>%
  select(req.exp1.not.small, req.exp1.not.large, id) %>%
  filter(!is.na(req.exp1.not.small) | !is.na(req.exp1.not.large))
# Minimal-requirement check for one stimulus.
#
# Looks up the stimulus in the global `min.require` table and returns the
# trials of `data_wide` that VIOLATE the annotated requirement (probability
# of the "not small" column below 0.25, or of the "not large" column above
# 0.75), i.e. the trials that should be discarded. Returns an empty tibble
# when no requirement exists for this stimulus.
check <- function(data_wide, stimulus) {
  # .env$ forces lookup of the function argument, not a data column of the
  # same name (equivalent to the original's `!!stimulus`).
  requirement <- min.require %>% filter(id == .env$stimulus)
  violations <- tibble()
  if (nrow(requirement) > 0) {
    col_not_small <- requirement$req.exp1.not.small
    col_not_large <- requirement$req.exp1.not.large
    violations <- data_wide %>% filter(stimulus_id == .env$stimulus)
    if (!is.na(col_not_small)) {
      # Requirement says this probability must NOT be small -> keep violators.
      violations <- violations %>% filter(.data[[col_not_small]] < 0.25)
    }
    if (!is.na(col_not_large)) {
      # Requirement says this probability must be large -> keep violators.
      violations <- violations %>% filter(.data[[col_not_large]] > 0.75)
    }
  }
  violations
}
# some trials have less than 4 utterances since some single responses were
# excluded, e.g. due to RT. With pivot_wider, we therefore get NA-values,
# which need to be filtered out again
data_normalized_wide <- data_normalized %>%
  pivot_wider(names_from = "utterance", values_from = "response") %>%
  filter(!is.na(a), !is.na(c), !is.na(ac), !is.na(none))
# Collect, over all stimuli, the trials violating the minimal requirements.
critical <- tibble()
for (stim in STIMULI) {
  critical <- bind_rows(critical, check(data_normalized_wide, stim))
}
critical
# Stimuli for which at least one trial was flagged as critical.
critical %>% select(stimulus_id) %>% unique() %>% pull(stimulus_id)
## [1] S7-130-dep S32-806 S30-805
## 25 Levels: S1-121 S10-203 S12-203 S15-443 S20-468 S22-468 ... S97-1674
# Drop the critical trials and return to long format.
data_normalized_wide <- anti_join(data_normalized_wide, critical)
data_normalized <- data_normalized_wide %>%
  pivot_longer(cols = c("a", "c", "ac", "none"), names_to = "utterance",
               values_to = "response")
# total and relative nb included responses
nrow(data_normalized)
## [1] 4856
nrow(data_normalized) / nrow(data)
## [1] 0.9712
# Write the cleaned, normalized long-format data to disk.
fn <- here("experiments", "blocksworld-prior", "results", "data-processed",
"tables_experimental.csv")
write.table(data_normalized , file = fn, sep = ",", row.names=FALSE)
# Mean response per stimulus and utterance, written alongside the raw table.
means <- data_normalized %>%
group_by(stimulus_id, utterance) %>%
summarise(mean=mean(response))
fn <- here("experiments", "blocksworld-prior", "results", "data-processed",
"tables_experimental_means.csv")
write.table(means, file = fn, sep = ",", row.names=FALSE)
# Look for participants who constantly gave all-or-none responses.
# n == 1 keeps trials whose raw responses summed to exactly 1 (i.e. 100
# slider points in total). NOTE(review): this also admits e.g. 50/50
# splits, not only strict all-or-none patterns -- confirm intent.
df_wide <- data_normalized %>%
  pivot_wider(names_from = utterance, names_prefix = "utt.",
              values_from = response) %>%
  ungroup() %>%
  filter(n == 1)
# How many such stimuli per participant?
df <- df_wide %>%
  count(participant_id, name = "n_all_none_stimuli")
df %>% ggplot(aes(n_all_none_stimuli)) + geom_histogram(binwidth = 1)
# which stimuli were rated as all-or-none by how many participants?
df <- df_wide %>%
  count(stimulus_id, name = "n_participants")
df %>% ggplot(aes(x = stimulus_id, y = n_participants)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Plot the data
# Per-stimulus violin plots of the responses, one facet per utterance.
# Iterate over `ids` (the stimuli remaining after filtering) rather than
# STIMULI: `ids` was previously computed but unused, and a stimulus whose
# trials were all discarded would yield an empty data frame and break the
# faceted plot.
ids <- data_normalized %>% pull(stimulus_id) %>% unique()
data_normalized <- data_normalized %>%
  mutate(utterance = factor(utterance, levels = c("ac", "a", "c", "none")))
labels <- c(ac="Blue and Green", a="Blue and ¬Green", c="¬Blue and Green",
            none="¬Blue and ¬Green")
for (s in ids) {
  df <- data_normalized %>% filter(stimulus_id == s)
  # Red point: mean response; yellow point: median response.
  df_means <- df %>% group_by(utterance) %>% summarise(m=mean(response),
                                                       med=median(response))
  p <- df %>%
    ggplot(aes(x=factor(0), y=response, fill=utterance)) +
    geom_violin(alpha=0.5) +
    geom_jitter(width = 0.2, alpha=0.5) +
    geom_point(data=df_means, mapping=aes(x = factor(0), y = m), col="red") +
    geom_point(data=df_means, mapping=aes(x=factor(0), y = med),col="yellow") +
    coord_flip() +
    labs(y="", x="") +
    theme_classic() +
    facet_wrap(~utterance, labeller = labeller(utterance=labels)) +
    # ggtitle(s) +
    theme(legend.position = "none", axis.text.y=element_blank(),
          axis.ticks.y =element_blank(),
          text = element_text(size=20),
          panel.spacing = unit(2, "lines"))
  fn <- paste0("responses-", s, ".png")
  ggsave(file.path(TARGET_DIR, fn), p, width=6, height=4)
  print(p)
}
# Re-read the processed table written above for the probability analysis.
fn <- here("experiments", "blocksworld-prior", "results", "data-processed",
"tables_experimental.csv")
dat.experiment1 <- read_csv(fn)
## Parsed with column specification:
## cols(
## participant_id = col_character(),
## stimulus_id = col_character(),
## n = col_double(),
## utterance = col_character(),
## response = col_double()
## )
# add_probs() comes from utils.R; judging from the keys used below it adds
# marginal and conditional probability columns (p_a, p_c, p_a_given_c, ...)
# prefixed "p_" -- verify in utils.R.
dat <- dat.experiment1 %>%
pivot_wider(names_from = utterance, values_from = response) %>%
add_probs() %>%
pivot_longer(cols=starts_with("p_"), names_to = "key", values_to = "prob")
# Overlaid density plot of two probability keys.
#
# @param dat        long-format tibble with columns `key` and `prob`
# @param p1,p2      names of the two probability keys to compare
# @param save_as    output file name for the saved figure
# @param target_dir output directory; defaults to the global TARGET_DIR for
#                   backward compatibility, but can now be overridden
# @return the ggplot object (also saved to disk as a side effect)
PlotDensity <- function(dat, p1, p2, save_as, target_dir = TARGET_DIR){
  p <- dat %>%
    # %in% replaces the equivalent `key == p1 | key == p2` pair.
    filter(key %in% c(p1, p2)) %>%
    ggplot(aes(prob, fill = key)) +
    geom_density(alpha = 0.5)
  # file.path() is the idiomatic, platform-aware equivalent of
  # paste(..., sep = .Platform$file.sep).
  ggsave(file.path(target_dir, save_as), p, width = 6, height = 4)
  p
}
# Density comparisons: the two conditionals, the two marginals, and their
# negated counterparts; each call also saves a .png to TARGET_DIR.
dat %>% PlotDensity("p_a_given_c", "p_c_given_a", "p-conditionals.png")
dat %>% PlotDensity("p_a", "p_c", "p-marginals.png")
dat %>% PlotDensity("p_na_given_nc", "p_nc_given_na", "p-neg-conditionals.png")
dat %>% PlotDensity("p_na", "p_nc", "p-neg-marginals.png")